###########################
#Packages
###########################
library(cowplot)
library(ggplot2)
library(tidytext)
library(quanteda)
library(SnowballC)
library(stm)
library(gridExtra)
library(rio)
library(tidyverse)


###########################
#Data
###########################
# Read the data set stored in lucid_coded_all.rda (rio::import).
lucid <- import("./Data/lucid_coded_all.rda")

# Rows flagged rep_extreme == 1, reduced to the id and the matching text column.
reps <- select(
  filter(lucid, rep_extreme == 1),
  id, rep_extreme_text
)

# Rows flagged dem_extreme == 1, reduced to the id and the matching text column.
dems <- select(
  filter(lucid, dem_extreme == 1),
  id, dem_extreme_text
)

###########################
#Total Words and Substantive
###########################

#### total words
# Split each Republican-prompt response into word tokens (one row per token)
# and tally how many times each respondent (id) used each word.
rep_words <- reps %>%
  unnest_tokens(word, rep_extreme_text) %>%
  count(id, word, sort = TRUE)

# Sum the per-word tallies to get the total number of tokens per respondent.
rep_total <- rep_words %>%
  group_by(id) %>%
  summarize(total = sum(n))

# Same two steps for the Democratic-prompt responses.
dem_words <- dems %>%
  unnest_tokens(word, dem_extreme_text) %>%
  count(id, word, sort = TRUE)

dem_total <- dem_words %>%
  group_by(id) %>%
  summarize(total = sum(n))

# Words per response, Republican prompt: mean, median, IQR, and quartiles.
mean(rep_total[["total"]])
median(rep_total[["total"]])
IQR(rep_total[["total"]])
quantile(rep_total[["total"]], probs = c(0.25, 0.75))

# Words per response, Democratic prompt: mean, median, IQR, and quartiles.
mean(dem_total[["total"]])
median(dem_total[["total"]])
IQR(dem_total[["total"]])
quantile(dem_total[["total"]], probs = c(0.25, 0.75))


### Substantive
# Frequency tables of the d_substantive and r_substantive columns, computed on
# the full lucid data rather than on the reps/dems subsets.
summarytools::freq(lucid$d_substantive)
summarytools::freq(lucid$r_substantive)

########################
#Figure
#########################


###### Cleaning
# Load the stop_words data set into the workspace.
data(stop_words)

# Tokenize the Republican-prompt responses, drop stop words, and add the stem
# of every remaining word. The join key is spelled out so the stop-word
# removal does not depend on which column names happen to be shared.
rep_df <- reps %>%
  unnest_tokens(word, rep_extreme_text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = wordStem(word))

# Same cleaning for the Democratic-prompt responses.
dem_df <- dems %>%
  unnest_tokens(word, dem_extreme_text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = wordStem(word))

# Horizontal bar chart of the 15 most frequent words (ties included) in the
# Republican-prompt responses, after dropping words whose stem is the party
# name, "parti", or "extrem".
rp <- rep_df %>%
  filter(word_stem != "republican", word_stem != "parti", word_stem != "extrem") %>%
  count(word, sort = TRUE) %>%
  # Order the factor levels by count so the bars are sorted in the plot.
  mutate(word = reorder(word, n)) %>%
  # Rank explicitly by the count column instead of relying on the implicit
  # "last column" default.
  top_n(15, n) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Word Count", title="Republican Party Too Extreme References") +
  # theme_bw() is a complete theme; the theme_minimal() that used to precede
  # it was fully replaced by it and has been removed as dead code.
  theme_bw(16)

# Same chart for the Democratic-prompt responses.
dp <- dem_df %>%
  filter(word_stem != "democrat", word_stem != "parti", word_stem != "extrem") %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n)) %>%
  top_n(15, n) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Word Count", title="Democratic Party Too Extreme References") +
  theme_bw(16)

# Place the two bar charts side by side.
both <- plot_grid(rp, dp)

#########
#Comparing Relative Use
#https://www.tidytextmining.com/twitter.html
#########


# Tokenize, remove stop words, and stem the Republican-prompt responses, then
# drop tokens whose stem is the party name (including the misspellings listed
# below), "parti", or "extrem". Each remaining row is tagged with its party.
rep_df1 <- reps %>%
  unnest_tokens(word, rep_extreme_text) %>%
  # Explicit join key: match stop words on the token column only.
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = wordStem(word)) %>%
  filter(!(word_stem %in% c("republican","rebublican", "replublian", "republican'", "republicansar", "republician", 
                            "parti", "extrem"))) %>%
  mutate(party = "Republican")

# Same preparation for the Democratic-prompt responses.
dem_df1 <- dems %>%
  unnest_tokens(word, dem_extreme_text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = wordStem(word)) %>%
  filter(!(word_stem %in% c("parti", "democrat", "dem", "democraft", "democrst", "extrem"))) %>%
  mutate(party = "Democratic")

# Stack both parties' tokens into one long table.
both_df1 <- bind_rows(rep_df1, dem_df1)

# Step 1: number of occurrences of every word within each party.
word_ratios1 <- count(both_df1, word, party)

# Step 2: keep only words used at least 10 times across both parties.
word_ratios1 <- ungroup(
  filter(group_by(word_ratios1, word), sum(n) >= 10)
)

# Step 3: one row per word, one count column per party (0 when unused).
word_ratios1 <- tidyr::spread(word_ratios1, party, n, fill = 0)

# Step 4: turn each party column into (count + 1) / (column total + 1).
word_ratios1 <- mutate_if(
  word_ratios1,
  is.numeric,
  list(~(. + 1) / (sum(.) + 1))
)

# Step 5: log of the Democratic-to-Republican ratio, largest first.
word_ratios1 <- mutate(word_ratios1, logratio = log(Democratic / Republican))
word_ratios1 <- arrange(word_ratios1, desc(logratio))

# Bar chart of the words with the largest absolute log ratios: up to 15 (plus
# ties) with logratio < 0 and up to 15 (plus ties) with logratio >= 0.
word_compare <- word_ratios1 %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  # Order the factor levels by log ratio so the bars are sorted in the plot.
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) + 
  geom_col(show.legend=FALSE) + 
  coord_flip() + 
  theme_bw(16) + 
  # NOTE(review): "logs odds ratio" looks like a typo for "log odds ratio";
  # the label is left unchanged so the saved figure stays identical.
  labs(title = "Comparison of Word Use", 
       y = "logs odds ratio (Democratic/Republican)", 
       x = "Word") + 
  # A single fill scale: FALSE (logratio >= 0) is gray45, TRUE (logratio < 0)
  # is black. The scale_fill_discrete() that used to precede this call was
  # replaced by it as soon as it was added, and the legend is hidden, so it
  # had no effect beyond a "scale already present" message.
  scale_fill_manual(values=c("gray45", "black")) 


# Stack the side-by-side bar charts (panel A) above the word-use comparison
# (panel B).
all_three <- plot_grid(
  both,
  word_compare,
  labels = c("A", "B"),
  nrow = 2
)

# Write the combined figure to Figure5.png.
ggsave(
  "Figure5.png",
  plot = all_three,
  height = 10,
  width = 14,
  dpi = 600
)




